import pandas as pd
import numpy as np
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, k_means
from sklearn_extra.cluster import KMedoids
from sklearn.metrics.cluster import (homogeneity_score, silhouette_score, davies_bouldin_score,
adjusted_rand_score, calinski_harabasz_score, adjusted_mutual_info_score,
v_measure_score, completeness_score)
from sklearn.datasets import make_blobs
from collections import defaultdict
import itertools
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Three Gaussian blobs with increasingly large spreads; `labels` holds the ground truth.
data, labels = make_blobs(n_samples=1000, cluster_std=[0.25, 1, 2.5], random_state=42)
# Note: k_means is a function (returning centroids, labels and inertia), not an estimator class.
clustering_algorithms = (AffinityPropagation, AgglomerativeClustering, KMedoids, k_means)
clustering_algorithms_titles = ('AffinityPropagation', 'AgglomerativeClustering', 'KMedoids', 'KMeans++')
clustering_metrics = (homogeneity_score, silhouette_score, davies_bouldin_score, adjusted_rand_score,
calinski_harabasz_score, adjusted_mutual_info_score, v_measure_score, completeness_score)
clustering_metrics_titles = ('homogeneity_score', 'silhouette_score', 'davies_bouldin_score',
                             'adjusted_rand_index', 'calinski_harabasz_score', 'adjusted_mutual_info',
                             'v_measure_score', 'completeness_score')
labels_mapping = {}
metrics_dataframe = pd.DataFrame(index=clustering_algorithms_titles, columns=clustering_metrics_titles)
for clustering_algorithm, clustering_algorithms_title in zip(clustering_algorithms, clustering_algorithms_titles):
    try:
        # Estimator classes are instantiated with their default parameters...
        algorithm = clustering_algorithm()
    except TypeError:
        # ...while the k_means function needs the data and a cluster count up front.
        algorithm = clustering_algorithm(data, 2)
    try:
        tmp_predictions = algorithm.fit_predict(data)
    except AttributeError:
        # k_means returned a (centroids, labels, inertia) tuple; take the labels.
        tmp_predictions = algorithm[1]
    labels_mapping[clustering_algorithms_title] = tmp_predictions
    for metric, metric_title in zip(clustering_metrics, clustering_metrics_titles):
        if metric_title not in ('silhouette_score', 'davies_bouldin_score', 'calinski_harabasz_score'):
            # External metrics compare the predictions against the ground-truth labels.
            metrics_dataframe.loc[clustering_algorithms_title, metric_title] = metric(labels, tmp_predictions)
        else:
            # Internal metrics score the predicted partition on the data itself.
            metrics_dataframe.loc[clustering_algorithms_title, metric_title] = metric(data, tmp_predictions)
metrics_dataframe.index.name = 'clustering_algorithm'
metrics_dataframe
def highlight_min(data, color='yellow'):
    """Highlight the minimum value, per column (Series) or over the whole frame."""
    attr = 'background-color: {}'.format(color)
    data = data.replace('%', '', regex=True).astype(float)
    if data.ndim == 1:
        is_min = data == data.min()
        return [attr if flag else '' for flag in is_min]
    else:
        is_min = data == data.min().min()
        return pd.DataFrame(np.where(is_min, attr, ''), index=data.index, columns=data.columns)
def highlight_max(data, color='yellow'):
    """Highlight the maximum value, per column (Series) or over the whole frame."""
    attr = 'background-color: {}'.format(color)
    data = data.replace('%', '', regex=True).astype(float)
    if data.ndim == 1:
        is_max = data == data.max()
        return [attr if flag else '' for flag in is_max]
    else:
        is_max = data == data.max().max()
        return pd.DataFrame(np.where(is_max, attr, ''), index=data.index, columns=data.columns)
# Davies-Bouldin is the only metric here where lower is better; all others are maximized.
metrics_best_mapping = {'davies_bouldin_score': highlight_min,
'calinski_harabasz_score': highlight_max,
'silhouette_score': highlight_max,
'homogeneity_score': highlight_max,
'adjusted_rand_index': highlight_max,
'adjusted_mutual_info': highlight_max,
'v_measure_score': highlight_max,
'completeness_score': highlight_max
}
metrics_min = ['davies_bouldin_score']
# Preserve the declared metric order (a plain set difference would be unordered).
metrics_max = [title for title in clustering_metrics_titles if title not in metrics_min]
metrics_dataframe[metrics_min].style.apply(highlight_min)
metrics_dataframe[metrics_max].style.apply(highlight_max)
KMedoids seems to be the best algorithm according to the clustering metrics.
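The per-metric winners can also be read off programmatically. This is a minimal sketch, not part of scikit-learn; best_settings is a hypothetical helper that picks the argmin for Davies-Bouldin and the argmax for every other metric, reusing metrics_min from above:
def best_settings(metrics_df):
    # For each metric column, return the index label of the best-scoring row:
    # the minimum for Davies-Bouldin, the maximum for everything else.
    return pd.Series({title: (metrics_df[title].astype(float).idxmin()
                              if title in metrics_min
                              else metrics_df[title].astype(float).idxmax())
                      for title in metrics_df.columns})

best_settings(metrics_dataframe)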
fig, axs = plt.subplots(nrows=len(clustering_algorithms_titles), ncols=1, figsize=(10, 20))
for ax, (algorithm_title, predictions) in zip(axs, labels_mapping.items()):
    # Color points by predicted cluster; do not shadow the ground-truth `labels`.
    ax.scatter(data[:, 0], data[:, 1], c=predictions)
    ax.set_title('{}, number of selected clusters: {}'.format(algorithm_title, len(set(predictions))))
clusters_range = range(2, 10)
kmeans_labels_mapping = {}
metrics_dataframe_kmeans = pd.DataFrame(index=clusters_range, columns=clustering_metrics_titles)
for nclusters in clusters_range:
    # k_means returns (centroids, labels, inertia); keep the predicted labels.
    _, kmeans_predictions, _ = k_means(data, nclusters)
    kmeans_labels_mapping[nclusters] = kmeans_predictions
    for metric, metric_title in zip(clustering_metrics, clustering_metrics_titles):
        if metric_title not in ('silhouette_score', 'davies_bouldin_score', 'calinski_harabasz_score'):
            metrics_dataframe_kmeans.loc[nclusters, metric_title] = metric(labels, kmeans_predictions)
        else:
            metrics_dataframe_kmeans.loc[nclusters, metric_title] = metric(data, kmeans_predictions)
metrics_dataframe_kmeans.index.name = 'nclusters'
metrics_dataframe_kmeans
metrics_dataframe_kmeans[metrics_min].style.apply(highlight_min)
metrics_dataframe_kmeans[metrics_max].style.apply(highlight_max)
Nine clusters seems to be the best parameter for the KMeans++ algorithm.
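The same hypothetical best_settings helper from above summarizes this table per metric:
best_settings(metrics_dataframe_kmeans)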
fig, axs = plt.subplots(nrows=len(clusters_range), ncols=1, figsize=(10, 30))
for ax, (nclusters, predictions) in zip(axs, kmeans_labels_mapping.items()):
    ax.scatter(data[:, 0], data[:, 1], c=predictions)
    ax.set_title('KMeans++, number of selected clusters: {}'.format(nclusters))
clusters_range = range(2, 10)
distance_metrics = ('euclidean', 'l1', 'l2', 'manhattan', 'cosine')
agglomerative_clustering_labels_mapping = defaultdict(dict)
index = pd.MultiIndex.from_product((clusters_range, distance_metrics))
metrics_dataframe_agglomerative = pd.DataFrame(index=index, columns=clustering_metrics_titles)
for nclusters, distance_metric in itertools.product(clusters_range, distance_metrics):
    algorithm = AgglomerativeClustering(n_clusters=nclusters, affinity=distance_metric, linkage='average')
    agglomerative_predictions = algorithm.fit_predict(data)
    agglomerative_clustering_labels_mapping[nclusters][distance_metric] = agglomerative_predictions
    for metric, metric_title in zip(clustering_metrics, clustering_metrics_titles):
        if metric_title not in ('silhouette_score', 'davies_bouldin_score', 'calinski_harabasz_score'):
            metrics_dataframe_agglomerative.loc[(nclusters, distance_metric), metric_title] = metric(
                labels, agglomerative_predictions)
        else:
            metrics_dataframe_agglomerative.loc[(nclusters, distance_metric), metric_title] = metric(
                data, agglomerative_predictions)
metrics_dataframe_agglomerative.index.names = ['nclusters', 'affinity']
metrics_dataframe_agglomerative
metrics_dataframe_agglomerative[metrics_min].style.apply(highlight_min)
metrics_dataframe_agglomerative[metrics_max].style.apply(highlight_max)
Eight clusters with Euclidean (equivalently, L2) affinity seems to work best for the AgglomerativeClustering algorithm.
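On this MultiIndexed table, the hypothetical best_settings helper returns (nclusters, affinity) pairs:
best_settings(metrics_dataframe_agglomerative)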
fig, axs = plt.subplots(nrows=len(clusters_range), ncols=len(distance_metrics), figsize=(100, 120))
for i, (nclusters, predictions_by_affinity) in enumerate(agglomerative_clustering_labels_mapping.items()):
    for j, (affinity, predictions) in enumerate(predictions_by_affinity.items()):
        axs[i][j].scatter(data[:, 0], data[:, 1], c=predictions)
        axs[i][j].set_title('Agglomerative Clustering, number of selected clusters: {}, affinity: {}'.format(
            nclusters, affinity))
damping_range = np.arange(0.5, 1, 0.1)
affinity_labels_mapping = {}
metrics_dataframe_affinity = pd.DataFrame(index=damping_range, columns=clustering_metrics_titles)
for damping in damping_range:
    algorithm = AffinityPropagation(damping=damping)
    affinity_predictions = algorithm.fit_predict(data)
    affinity_labels_mapping[damping] = affinity_predictions
    for metric, metric_title in zip(clustering_metrics, clustering_metrics_titles):
        if metric_title not in ('silhouette_score', 'davies_bouldin_score', 'calinski_harabasz_score'):
            metrics_dataframe_affinity.loc[damping, metric_title] = metric(labels, affinity_predictions)
        else:
            metrics_dataframe_affinity.loc[damping, metric_title] = metric(data, affinity_predictions)
metrics_dataframe_affinity.index.name = 'damping'
metrics_dataframe_affinity
metrics_dataframe_affinity[metrics_min].style.apply(highlight_min)
metrics_dataframe_affinity[metrics_max].style.apply(highlight_max)
A damping of 0.9 seems to be the best parameter for the AffinityPropagation algorithm.
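Again via the hypothetical best_settings helper:
best_settings(metrics_dataframe_affinity)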
fig, axs = plt.subplots(nrows=len(damping_range), ncols=1, figsize=(10, 30))
for ax, (damping, predictions) in zip(axs, affinity_labels_mapping.items()):
    ax.scatter(data[:, 0], data[:, 1], c=predictions)
    # Format the damping to one decimal to hide float round-off from np.arange.
    ax.set_title('Affinity Propagation, damping: {:.1f}'.format(damping))
clusters_range = range(2, 10)
distance_metrics = ('euclidean', 'l1', 'l2', 'manhattan', 'cosine')
kmedoids_clustering_labels_mapping = defaultdict(dict)
index = pd.MultiIndex.from_product((clusters_range, distance_metrics))
metrics_dataframe_kmedoids = pd.DataFrame(index=index, columns=clustering_metrics_titles)
for nclusters, distance_metric in itertools.product(clusters_range, distance_metrics):
    algorithm = KMedoids(n_clusters=nclusters, metric=distance_metric)
    kmedoids_predictions = algorithm.fit_predict(data)
    kmedoids_clustering_labels_mapping[nclusters][distance_metric] = kmedoids_predictions
    for metric, metric_title in zip(clustering_metrics, clustering_metrics_titles):
        if metric_title not in ('silhouette_score', 'davies_bouldin_score', 'calinski_harabasz_score'):
            metrics_dataframe_kmedoids.loc[(nclusters, distance_metric), metric_title] = metric(
                labels, kmedoids_predictions)
        else:
            metrics_dataframe_kmedoids.loc[(nclusters, distance_metric), metric_title] = metric(
                data, kmedoids_predictions)
metrics_dataframe_kmedoids.index.names = ['nclusters', 'metric']
metrics_dataframe_kmedoids
metrics_dataframe_kmedoids[metrics_min].style.apply(highlight_min)
metrics_dataframe_kmedoids[metrics_max].style.apply(highlight_max)
Five clusters with the Euclidean (equivalently, L2) metric seems to work best for the KMedoids algorithm.
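And once more with the hypothetical best_settings helper, which here returns (nclusters, metric) pairs:
best_settings(metrics_dataframe_kmedoids)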
fig, axs = plt.subplots(nrows=len(clusters_range), ncols=len(distance_metrics), figsize=(100, 120))
for i, (nclusters, predictions_by_metric) in enumerate(kmedoids_clustering_labels_mapping.items()):
    for j, (distance_metric, predictions) in enumerate(predictions_by_metric.items()):
        axs[i][j].scatter(data[:, 0], data[:, 1], c=predictions)
        axs[i][j].set_title('KMedoids Clustering, number of selected clusters: {}, metric: {}'.format(
            nclusters, distance_metric))